Linear Model Selection, Regularization and Moving Beyond Linearity
Module 09
Ray J. Hoobler
Libraries
Code
library(tidyverse)library(broom) # For tidying model outputlibrary(leaps) # For best subset selection library(readxl) # For reading Excel fileslibrary(patchwork) # For combining plotslibrary(latex2exp) # For LaTeX expressionslibrary(rlang) # For working with formulaslibrary(glmnet) # For ridge regression and the lasso
Given is the variable name, variable type, the measurement unit and a brief description. The concrete compressive strength is the regression problem. The order of this listing corresponds to the order of numerals along the rows of the database.
Name – Data Type – Measurement – Description
Cement (component 1) – quantitative – kg in a m3 mixture – Input Variable
Blast Furnace Slag (component 2) – quantitative – kg in a m3 mixture – Input Variable
Fly Ash (component 3) – quantitative – kg in a m3 mixture – Input Variable
Water (component 4) – quantitative – kg in a m3 mixture – Input Variable
Superplasticizer (component 5) – quantitative – kg in a m3 mixture – Input Variable
Coarse Aggregate (component 6) – quantitative – kg in a m3 mixture – Input Variable
Fine Aggregate (component 7) – quantitative – kg in a m3 mixture – Input Variable
where \(d\) is the number of predictors in the model and \(\hat{\sigma}^2\) is an estimate of the variance of the error term.
Working with Combinations
Walk through the following code to examine how we can work with combinations of predictors.
Code
# Enumerate every non-empty subset of the first eight predictor columns,
# fit an OLS model for each, and store the fit plus its metadata.
#
# Relies on globals defined earlier in the document:
#   `concrete`              — the concrete-strength data frame
#   `concrete_column_names` — its column names (predictors in positions 1:8)
n_predictors <- 8

# Preallocate instead of growing the list one element at a time:
# sum(choose(8, i), i = 1..8) = 2^8 - 1 = 255 models in total.
model_list <- vector("list", 2^n_predictors - 1)
model_idx <- 0
total_combinations <- 0

for (i in seq_len(n_predictors)) {
  # Each column of the combn() matrix is one i-predictor combination
  concrete_predictors <- combn(concrete_column_names[1:8], i)
  num_combinations <- ncol(concrete_predictors)

  cat(sprintf("\nNumber of predictors: %d\n", i))
  cat(sprintf("Number of combinations: %d\n", num_combinations))

  total_combinations <- total_combinations + num_combinations

  for (j in seq_len(num_combinations)) {
    # `fml` (not `formula`) avoids shadowing stats::formula()
    fml <- reformulate(concrete_predictors[, j], response = "strength")
    model_combination <- lm(fml, data = concrete)

    # Record model information
    model_idx <- model_idx + 1
    model_list[[model_idx]] <- list(
      num_predictors = i,
      combination = paste(concrete_predictors[, j], collapse = ", "),
      formula = paste(deparse(fml), collapse = " "), # ensure single string
      model = model_combination
    )
  }
}
Number of predictors: 1
Number of combinations: 8
Number of predictors: 2
Number of combinations: 28
Number of predictors: 3
Number of combinations: 56
Number of predictors: 4
Number of combinations: 70
Number of predictors: 5
Number of combinations: 56
Number of predictors: 6
Number of combinations: 28
Number of predictors: 7
Number of combinations: 8
Number of predictors: 8
Number of combinations: 1
Code
# Flatten the per-model records into a tibble: one row per fitted model,
# with the lm objects held in a list-column.
model_results <- tibble::tibble(
  num_predictors = map_int(model_list, "num_predictors"),
  combination = map_chr(model_list, "combination"),
  formula = map_chr(model_list, "formula"),
  model = map(model_list, "model")
)

# Report how many candidate models were enumerated (2^8 - 1 = 255)
cat("\nTotal number of combinations: ", total_combinations, "\n", sep = "")
Total number of combinations: 255
Code
# Sanity check: the tibble should hold one row per enumerated model
cat("Number of rows in model_results: ", nrow(model_results), "\n", sep = "")
Number of rows in model_results: 255
Code
# Display the resulting tibble (explicit print instead of auto-printing)
print(model_results)
Custom Function to Split Data and Compare Models
For many cases, you’ll be able to use package functions to perform model selection.
Code
# Exhaustive subset comparison on a train/test split.
# (Originally drafted with the help of Claude.ai.)
#
# Fits an OLS model for every non-empty combination of the predictor
# columns, scores each fit on a held-out test set, and returns one
# tibble row per model.
#
# Arguments:
#   data           data frame containing the response and predictors
#   response_var   response column name (character scalar)
#   predictor_vars predictor columns (tidyselect expression or a
#                  character vector of column names)
#   proportion     fraction of rows used for training (default 0.8)
#   seed           optional RNG seed for a reproducible split
#
# Returns a tibble with columns: num_predictors, combination, formula,
# model (list-column of lm fits), test_rmse, test_rsq.
compare_models <- function(data, response_var, predictor_vars,
                           proportion = 0.8, seed = NULL) {
  # Set seed if provided (before the random train/test split)
  if (!is.null(seed)) set.seed(seed)

  # Capture the predictor selection so callers may pass a bare
  # tidyselect expression as well as a character vector
  pred_vars <- enquo(predictor_vars)

  # Keep only the response plus the requested predictors
  selected_data <- data |> select(!!response_var, !!pred_vars)

  all_column_names <- colnames(selected_data)
  response_name <- as.character(response_var)
  predictor_names <- setdiff(all_column_names, response_name)

  # Random train/test split (sample.int(n) draws the same stream as
  # sample(1:n), so seeded results are unchanged)
  n <- nrow(selected_data)
  train_indices <- sample.int(n, size = round(proportion * n))
  train_data <- selected_data[train_indices, ]
  test_data <- selected_data[-train_indices, ]

  # Preallocate: 2^p - 1 non-empty predictor subsets
  p <- length(predictor_names)
  model_list <- vector("list", 2^p - 1)
  model_idx <- 0
  total_combinations <- 0

  for (i in seq_len(p)) {
    # Each column of the combn() matrix is one i-predictor combination
    predictor_combinations <- combn(predictor_names, i)
    num_combinations <- ncol(predictor_combinations)
    total_combinations <- total_combinations + num_combinations

    for (j in seq_len(num_combinations)) {
      # `fml` (not `formula`) avoids shadowing stats::formula()
      fml <- reformulate(predictor_combinations[, j], response = response_name)
      model_combination <- lm(fml, data = train_data)

      # Out-of-sample performance on the held-out rows
      predictions <- predict(model_combination, newdata = test_data)
      rmse <- sqrt(mean((test_data[[response_name]] - predictions)^2))
      rsq <- cor(test_data[[response_name]], predictions)^2

      # Record model information
      model_idx <- model_idx + 1
      model_list[[model_idx]] <- list(
        num_predictors = i,
        combination = paste(predictor_combinations[, j], collapse = ", "),
        formula = paste(deparse(fml), collapse = " "), # ensure single string
        model = model_combination,
        test_rmse = rmse,
        test_rsq = rsq
      )
    }
  }

  # Flatten the records into a tibble, one row per fitted model
  model_results <- tibble::tibble(
    num_predictors = map_int(model_list, "num_predictors"),
    combination = map_chr(model_list, "combination"),
    formula = map_chr(model_list, "formula"),
    model = map(model_list, "model"),
    test_rmse = map_dbl(model_list, "test_rmse"),
    test_rsq = map_dbl(model_list, "test_rsq")
  )
  cat(sprintf("\nTotal number of models: %d\n", nrow(model_results)))
  model_results
}

# Example usage:
result <- compare_models(concrete, "strength", concrete_column_names,
                         proportion = 0.8, seed = 42)
Total number of models: 255
Code
# View results:
# result |> arrange(test_rmse) |> head(10) # Top 10 models by RMSE
result |> arrange(desc(test_rsq)) |> head(10) # Top 10 models by R-squared
Test Model Results
Code
# Best test-set R^2 within each model size; these points trace the
# upper envelope of the scatter below.
max_results <- result |>
  group_by(num_predictors) |>
  filter(test_rsq == max(test_rsq)) |>
  ungroup()

# All 255 models (grey points) with the per-size best models
# highlighted in red.
result |>
  ggplot() +
  geom_point(aes(x = num_predictors, y = test_rsq), alpha = 1/3) +
  geom_line(aes(x = num_predictors, y = test_rsq), data = max_results, color = "red") +
  geom_point(aes(x = num_predictors, y = test_rsq), data = max_results, color = "red") +
  labs(
    title = "Model Selection Using Subset Selection on Test Data",
    subtitle = "Custom function to compare models based on test data",
    x = "Number of Predictors",
    y = "R<sup>2</sup>" # rendered by ggtext::element_markdown below
  ) +
  scale_x_continuous(breaks = 1:8) +
  theme_light() +
  theme(
    plot.title = ggtext::element_markdown(size = 16),
    plot.title.position = "plot",
    plot.subtitle = ggtext::element_markdown(size = 12),
    axis.title = ggtext::element_markdown(size = 10)
  )
Assignment: Subset Selection Methods
ISLR Lab: 6.5.1 Subset Selection Methods
Shrinkage Methods
Ridge Regression
Lasso
Elastic Net
Selecting the Tuning Parameter
Ridge Regression
Recall that linear regression minimizes RSS to find the best-fitting line.
\(\lambda \ge 0\) is a tuning parameter that controls the amount of shrinkage and is determined using cross-validation.
Note
We will use the glmnet package to perform ridge regression.
Ridge Regression Penalty Term
The penalty term is often referred to as \(\ell_2\) regularization and has a shorthand notation of
\[
|| \beta ||_2^2 = \sum_{j=1}^{p} \beta_j^2
\]
Scaling the Data
Standardizing the Predictors (ISLR, 239)
The ridge regression coefficient estimates can change substantially when multiplying a given predictor by a constant. \(X_j \hat{\beta}_{j,\lambda}^R\) will depend not only on the value of \(\lambda\), but also on the scaling of the \(j\)th predictor… the value of \(X_j \hat{\beta}_{j,\lambda}^R\) may even depend on the scaling of the other predictors! Therefore, it is best to apply ridge regression after standardizing the predictors, using the formula
where \(\alpha\) controls the balance between Ridge and Lasso regression.
\(\alpha = 1\) is Lasso regression and is the default in glmnet. \(\alpha = 0\) is Ridge regression.
Assignment: Ridge Regression and the Lasso
ISLR Lab: 6.5.2 Ridge Regression and the Lasso
Readings (1/2)
Dimension Reduction Methods
Principal Components Regression
Partial Least Squares
Principal Components Regression
Partial Least Squares
Readings (2/2)
Considerations in High Dimensions
High-Dimensional Data
What Goes Wrong in High Dimensions?
Regression in High Dimensions
Interpreting Results in High Dimensions
Moving Beyond Linearity
Polynomial Regression (1/)
We can extend linear regression to include polynomial terms where the coefficients are still linear, but the predictors are polynomial terms of the original predictors.
# Raw loadcell calibration data: Response vs. applied Load Level.
# At this scale the relationship looks perfectly linear; the residual
# plots later in the module reveal the hidden curvature.
loadcell |>
  ggplot() +
  geom_point(aes(x = `Load Level`, y = Response)) +
  labs(
    title = "Loadcell Data",
    subtitle = "Data appears to have a linear relationship",
    x = "Load Level",
    y = "Response"
  ) +
  theme_light()
Polynomial Regression (4/)
Example 1: Quadratic Regression on Loadcell data
Code
# Fit a simple linear model: Response as a function of Load Level.
# (Backticks are required because the column name contains a space.)
loadcell_lm <- lm(Response ~ `Load Level`, data = loadcell)

# Report coefficients, residual summary, and fit statistics
loadcell_lm |> summary()
Call:
lm(formula = Response ~ `Load Level`, data = loadcell)
Residuals:
Min 1Q Median 3Q Max
-3.181e-04 -2.075e-04 -2.862e-05 1.983e-04 4.208e-04
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -7.156e-04 9.025e-05 -7.929 5.97e-09 ***
`Load Level` 1.003e-01 6.725e-06 14909.755 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.000239 on 31 degrees of freedom
Multiple R-squared: 1, Adjusted R-squared: 1
F-statistic: 2.223e+08 on 1 and 31 DF, p-value: < 2.2e-16
Note: R-squared is 1!
Polynomial Regression (5/)
Example 1: Inspect the residuals
Code
# Augment the fitted model with per-observation diagnostics (.resid, etc.)
loadcell_resid <- augment(loadcell_lm)

# Residuals vs. predictor: the systematic (curved) pattern here signals
# a missing higher-order term, despite the R-squared of 1 above.
loadcell_resid |>
  ggplot() +
  geom_point(aes(x = `Load Level`, y = .resid)) +
  labs(
    title = "Loadcell Residuals from Simple Linear Regression",
    subtitle = "Residuals are not randomly distributed",
    x = "Load Level",
    y = "Residuals"
  ) +
  theme_light()
Polynomial Regression (6/)
Example 1: Quadratic Regression on Loadcell data
Code
# Quadratic fit using orthogonal polynomials of degree 2
# (poly() keeps the linear and quadratic terms uncorrelated)
loadcell_lm_quad <- lm(Response ~ poly(`Load Level`, 2), data = loadcell)

# Report coefficients and fit statistics for the quadratic model
loadcell_lm_quad |> summary()
Call:
lm(formula = Response ~ poly(`Load Level`, 2), data = loadcell)
Residuals:
Min 1Q Median 3Q Max
-9.966e-05 -1.466e-05 5.944e-06 2.515e-05 5.595e-05
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.193e+00 6.552e-06 182130.14 <2e-16 ***
poly(`Load Level`, 2)1 3.563e+00 3.764e-05 94658.88 <2e-16 ***
poly(`Load Level`, 2)2 1.314e-03 3.764e-05 34.92 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.764e-05 on 30 degrees of freedom
Multiple R-squared: 1, Adjusted R-squared: 1
F-statistic: 4.48e+09 on 2 and 30 DF, p-value: < 2.2e-16
Polynomial Regression (7/)
Example 1: Inspect the residuals of the quadratic model
Code
set.seed(42) # added so jitter will not change between renders
# Per-observation diagnostics for the quadratic fit
loadcell_resid_quad <- augment(loadcell_lm_quad)
loadcell_resid_quad |>
  ggplot() +
  # NOTE(review): x is taken from loadcell$`Load Level` rather than the
  # augmented data's own `Load Level` column — same values assuming row
  # order is preserved by augment(); confirm
  geom_point(aes(x = loadcell$`Load Level`, y = .resid),
    position = position_jitter(width = 0.15),
    shape = 1) +
  labs(
    title = "Loadcell Residuals from Quadratic Regression",
    subtitle = "Residuals are randomly distributed",
    x = "Load Level",
    y = "Residuals"
  ) +
  theme_light()
Polynomial Regression (8/)
Example 1: Alternative method to construct the polynomial model
Code
# Same quadratic model built from raw (non-orthogonal) terms:
# I() protects ^ from formula interpretation so it means squaring
loadcell_lm_quad2 <- lm(
  Response ~ `Load Level` + I(`Load Level`^2),
  data = loadcell
)

# Coefficients now describe the raw polynomial; fit is identical
loadcell_lm_quad2 |> summary()
Call:
lm(formula = Response ~ `Load Level` + I(`Load Level`^2), data = loadcell)
Residuals:
Min 1Q Median 3Q Max
-9.966e-05 -1.466e-05 5.944e-06 2.515e-05 5.595e-05
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.840e-05 2.451e-05 -0.751 0.459
`Load Level` 1.001e-01 4.839e-06 20687.891 <2e-16 ***
I(`Load Level`^2) 7.032e-06 2.014e-07 34.922 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.764e-05 on 30 degrees of freedom
Multiple R-squared: 1, Adjusted R-squared: 1
F-statistic: 4.48e+09 on 2 and 30 DF, p-value: < 2.2e-16
Assignment: Polynomial Regression
ISLR Lab: 7.8.1 Polynomial Regression
End of Module 9
References
Backup
Best Subset Selection (2/)
Creating a function to do the best subset selection
Code
# Best subset selection with a train/test split, built on
# leaps::regsubsets().
#
# Arguments:
#   data        data frame containing the response and predictors
#   predictors  character vector of predictor column names
#   response    single response column name (character scalar)
#   train_prop  fraction of rows used for training (default 0.7)
#   seed        RNG seed for a reproducible split (default 42)
#
# Returns a list:
#   full_results  tibble of adjr2/cp/bic/test_mse per model size
#   best_models   best size under each selection criterion
#   subsets       the regsubsets object
#   train_data, test_data  the split data frames
subset_selection <- function(data, predictors, response,
                             train_prop = 0.7, seed = 42) {
  # Validate inputs early with informative errors
  if (!is.data.frame(data)) {
    stop("Input 'data' must be a data frame")
  }
  if (!is.character(predictors) || !all(predictors %in% names(data))) {
    stop("Input 'predictors' must be a character vector of column names present in the data frame")
  }
  if (!is.character(response) || length(response) != 1 || !(response %in% names(data))) {
    stop("Input 'response' must be a single column name present in the data frame")
  }

  # Set seed for reproducibility, then split into training/test sets
  # (sample.int(n) draws the same stream as sample(1:n))
  set.seed(seed)
  train_indices <- sample.int(nrow(data), size = floor(train_prop * nrow(data)))
  train_data <- data[train_indices, ]
  test_data <- data[-train_indices, ]

  # Build the full-model formula; reformulate() is the purpose-built
  # tool for assembling formulas from character vectors
  fml <- reformulate(predictors, response = response)

  # Exhaustive subset search on the training data
  subsets <- regsubsets(fml, data = train_data, nvmax = length(predictors))
  summary_subsets <- summary(subsets)

  # One row per model size; regsubsets returns nvmax sizes here
  results <- tibble(
    n_vars = seq_along(predictors),
    adjr2 = summary_subsets$adjr2,
    cp = summary_subsets$cp,
    bic = summary_subsets$bic
  )

  # Held-out mean squared error for a fitted lm
  calculate_test_mse <- function(model, test_data, response) {
    predictions <- predict(model, newdata = test_data)
    mean((test_data[[response]] - predictions)^2)
  }

  # Refit the best model of each size with lm() (so predict() works),
  # then score it on the test set; vapply() guarantees a numeric vector
  test_mse <- vapply(seq_along(predictors), function(i) {
    coef_matrix <- coef(subsets, id = i)
    variables <- names(coef_matrix)[-1] # exclude intercept
    model <- lm(reformulate(variables, response = response), data = train_data)
    calculate_test_mse(model, test_data, response)
  }, numeric(1))

  results <- results |> mutate(test_mse = test_mse)

  # Best model size under each criterion (max adjr2; min cp/bic/mse)
  best_adjr2 <- results |> slice_max(adjr2, n = 1)
  best_cp <- results |> slice_min(cp, n = 1)
  best_bic <- results |> slice_min(bic, n = 1)
  best_test_mse <- results |> slice_min(test_mse, n = 1)

  # Combine results, labelling each row with its criterion
  best_models <- bind_rows(
    best_adjr2 |> mutate(criterion = "Adjusted R-squared"),
    best_cp |> mutate(criterion = "Mallows' Cp"),
    best_bic |> mutate(criterion = "BIC"),
    best_test_mse |> mutate(criterion = "Test MSE")
  )

  # Return full results, the per-criterion winners, and the split data
  list(
    full_results = results,
    best_models = best_models,
    subsets = subsets,
    train_data = train_data,
    test_data = test_data
  )
}